In [ ]:
from IPython.display import Image, display, HTML
Image("images/munich.jpg")
In [ ]:
display(HTML("<table><tr><td><p><b>Rain Princess - Leonid Afremov</b></p><img src='images/princess.jpeg'></td><td><b><p>Munich + Rain Princess + Machine Learning</b></p><img src='images/munich-princess-out.jpg'></td></tr></table>"))
In [ ]:
display(HTML("<table><tr><td><p><b>The Great Wave off Kanagawa - Katsushika Hokusai</b></p><img src='images/wave.jpg'></td><td><b><p>Munich + The Great Wave + Machine Learning</b></p><img src='images/munich-wave-out.jpg'></td></tr></table>"))
In [ ]:
display(HTML("<table><tr><td><p><b>La Muse - Pablo Picaso</b></p><img src='images/muse.jpg'></td><td><b><p>Munich + La Muse + Machine Learning</b></p><img src='images/munich-muse-out.jpg'></td></tr></table>"))
In [ ]:
display(HTML("<table><tr><td><p><b>Udnie - Francis Picabia</b></p><img src='images/udnie.jpg'></td><td><b><p>Munich + Udnie + Machine Learning</b></p><img src='images/munich-udnie-out.jpg'></td></tr></table>"))
In [ ]:
display(HTML("<table><tr><td><b><p>Scream - Edvard Munch</b></p><img src='images/scream.jpg'></td><td><b><p>Munich + Scream + Machine Learning</b></p><img src='images/munich-scream-out.jpg'></td></tr></table>"))
In [ ]:
display(HTML("<table><tr><td><p><b>The Shipwreck of the Minotaur - Joseph Mallord William Turner</b></p><img src='images/wreck.jpg'></td><td><b><p>Munich + Shipwreck + Machine Learning</b></p><img src='images/munich-wreck-out.jpg'></td></tr></table>"))
In [ ]:
# A bit about MNIST dataset
from tensorflow.examples.tutorials.mnist import input_data
data = input_data.read_data_sets("data/MNIST/", one_hot=True)
import numpy as np
from scipy.stats import norm
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
import seaborn as sns
In [ ]:
%matplotlib inline
data.test.cls = np.array([label.argmax() for label in data.test.labels])
# We know that MNIST images are 28 pixels in each dimension.
img_size = 28
# Images are stored in one-dimensional arrays of this length.
img_size_flat = img_size * img_size
# Tuple with height and width of images used to reshape arrays.
img_shape = (img_size, img_size)
# Number of classes, one class for each of 10 digits.
num_classes = 10
def plot_images(images, cls_true, cls_pred=None):
    """Draw nine images in a 3x3 grid, captioning each with its class.

    images   -- sequence of nine image arrays; each one is reshaped to the
                file-level ``img_shape`` before it is drawn.
    cls_true -- sequence of nine true class labels.
    cls_pred -- optional sequence of predicted labels; when supplied, the
                prediction is shown next to the true class.

    Fails an assertion unless both ``images`` and ``cls_true`` have
    exactly nine entries.
    """
    assert len(images) == len(cls_true) == 9

    # 3x3 grid of sub-plots with extra spacing between the cells.
    fig, axes = plt.subplots(3, 3)
    fig.subplots_adjust(hspace=0.3, wspace=0.3)

    for idx, axis in enumerate(axes.flat):
        axis.imshow(images[idx].reshape(img_shape), cmap='binary')

        # Caption: true class, plus the prediction when one is available.
        if cls_pred is not None:
            caption = "True: {0}, Pred: {1}".format(cls_true[idx], cls_pred[idx])
        else:
            caption = "True: {0}".format(cls_true[idx])
        axis.set_xlabel(caption)

        # Hide the tick marks on both axes.
        axis.set_xticks([])
        axis.set_yticks([])
# Get the first images from the test-set.
images = data.test.images[0:9]
# Get the true classes for those images.
cls_true = data.test.cls[0:9]
# Plot the images and labels using our helper-function above.
plot_images(images=images, cls_true=cls_true)
In [ ]:
# String
string = 'Machine learning '
string2 = ' dojo '
string3 = ' part I'
print string + string2 + string3
print 'String variable type is: {}'.format(type(string))
In [ ]:
# Integers
number = 10
number2 = 20
number3 = 30
print number + number2 + number3
print 'number variable type is: {}'.format(type(number))
In [ ]:
# Booleans
boolean = True
boolean2 = True
boolean3 = False
# `and` binds tighter than `or`: (boolean and boolean2) or boolean3
print(boolean and boolean2 or boolean3)
print('boolean variable type is: {}'.format(type(boolean)))
In [ ]:
# Floating point numbers
floating = 3.14
floating2 = 2.79
floating3 = 10.01
print floating + floating2 + floating3
print 'floating variable type is: {}'.format(type(floating))
In [ ]:
if 10 > 8:
print '10 is greater than 8.'
print '10 is greater than 8.'
print '10 is greater than 8.'
In [ ]:
a = True
b = 10
c = 20
print 'first if statement...'
if b < c and a:
print 'All fine.'
else:
print 'Not all fine.'
print 'second if statement...'
if b < c and (not a):
print 'All fine.'
else:
print 'Not all fine.'
In [ ]:
if 10 > 20:
message = "if only 10 were greater than 20"
elif 10 > 30:
message = "elif means 'else if'"
else:
message = "when all else fails use else "
message
In [ ]:
for i in [1, 2, 3, 4, 5]:
print i
In [ ]:
# Demonstrates `continue` and `break`. The range must extend past 5,
# otherwise the `break` branch below can never be reached.
for x in range(10):
    if x == 3:
        continue  # go immediately to the next iteration
    if x == 5:
        break  # quit the loop entirely
    print(x)
In [ ]:
x = 0
while x < 5:
print x, "is less than 5"
x += 1
In [ ]:
a = True
x = 0
while a:
print x, "is less than 10"
x += 1
if x >= 10:
a = False
In [215]:
# Lists
numbers = [1, 4, 9, 16, 25]
In [ ]:
numbers[:]
In [ ]:
numbers[:2]
In [ ]:
numbers[2:]
In [ ]:
type(numbers)
In [ ]:
letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
len(letters)
In [ ]:
letters[2]
In [ ]:
a = [66.25, 333, 333, 1, 1234.5]
a
In [ ]:
a.count(333), a.count(66.25), a.count('x')
In [ ]:
a.insert(2, -1)
a
In [ ]:
a.append(333)
a
In [ ]:
a.index(333)
In [ ]:
a.remove(333)
a
In [ ]:
a.reverse()
a
In [ ]:
a.sort()
a
In [ ]:
a.pop()
In [ ]:
a
In [232]:
# dictionaries
phones = {'Spiderman': 151984858, 'Me': 151234324}
In [ ]:
phones['Superman'] = 15104928
phones
In [ ]:
phones['Spiderman']
In [ ]:
del phones['Me']
phones
In [ ]:
phones['Batman'] = 15123545
phones
In [ ]:
phones.keys()
In [ ]:
'Ken' in phones
In [215]:
# tuples
In [ ]:
tuple = 31213, 123453, 'hi Ml!'
tuple
In [ ]:
tuple[0]
In [ ]:
tuple[2]
In [ ]:
# Tuples are immutable: this item assignment raises TypeError
# (presumably kept here as a demonstration of that behaviour).
tuple[1] = 1234
In [ ]:
tupleTheSecond = tuple, (1, 2, 3, 4, 5)
tupleTheSecond
In [244]:
t1, t2 = tupleTheSecond
In [ ]:
t1
In [ ]:
t2
In [ ]:
for i, j in zip (t1, t2):
print i, j
In [ ]:
type(t1)
In [ ]:
# sets
In [249]:
basket = ['apple', 'orange', 'apple', 'pear', 'orange', 'banana']
In [250]:
fruit = set(basket)
In [ ]:
fruit
In [ ]:
'orange' in fruit
In [ ]:
'plum' in fruit
In [254]:
# import pandas library
import pandas as pd
In [ ]:
# Show version of pandas library
print pd.__version__
In [ ]:
# it is all about describing the data
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
def randrange(n, vmin, vmax):
    """Return an array of shape (n,) sampled from Uniform(vmin, vmax).

    The samples come from ``np.random.rand`` and are scaled by the width
    of the interval, then shifted by ``vmin``.
    """
    span = vmax - vmin
    unit_samples = np.random.rand(n)
    return span * unit_samples + vmin
fig = plt.figure(figsize=(14, 12))
ax = fig.add_subplot(111, projection='3d')
n = 100
# For each set of style and range settings, plot n random points in the box
# defined by x in [23, 32], y in [0, 100], z in [zlow, zhigh].
for c, m, zlow, zhigh in [('r', 'o', -50, -25), ('b', '^', -30, -5)]:
xs = randrange(n, 23, 32)
ys = randrange(n, 0, 100)
zs = randrange(n, zlow, zhigh)
ax.scatter(xs, ys, zs, c=c, marker=m)
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.show()
Dataset features are self-explanatory. The dataset is taken from the Kaggle website.
In [257]:
# read csv file
nn = pd.read_csv('kc_house_data.csv')
In [ ]:
# top 10 data records
nn.head(10)
In [ ]:
# Check whether any column contains null values, then show the row count.
# Both results are printed explicitly: a notebook cell only displays the
# value of its last expression, so the null check would otherwise be lost.
print(nn.isnull().any())
print(len(nn))
In [262]:
# Add one record that has NaN in every column except 'id' and 'price'.
# Numeric literals are used (not strings) so that appending the row does
# not turn the numeric 'id' and 'price' columns into object columns.
nn = nn.append({'id': 12345, 'price': 12345.23}, ignore_index=True)
In [ ]:
len(nn)
In [ ]:
# check number of NaN values in some column
len(nn[nn.bedrooms.isnull()])
In [ ]:
# show list of the records where column bedrooms contain NaN values
nn[nn.bedrooms.isnull()]
In [266]:
# drop NaN values
nn = nn.dropna()
In [ ]:
# Check the number of NaN records after dropping NaNs, then the row count.
# Both are printed because a notebook cell only displays its last expression.
print(len(nn[nn.bedrooms.isnull()]))
print(len(nn))
In [ ]:
nn.describe()
In [ ]:
foot_to_meter_ratio = 0.092903
nn['sqm2_living']=nn['sqft_living'] * foot_to_meter_ratio
nn['sqm2_living'] = nn['sqm2_living'].round(0)
nn['sqm2_lot']=nn['sqft_lot'] * foot_to_meter_ratio
nn['sqm2_lot'] = nn['sqm2_lot'].round(0)
# show all columns
pd.set_option("display.max_columns",99)
pd.set_option("display.max_rows",999)
nn.head()
In [ ]:
# Convert the basement area with a vectorised multiply-and-round, the same
# way the living and lot areas are converted, instead of a per-row lambda.
nn['sqm2_basement'] = (nn['sqft_basement'] * foot_to_meter_ratio).round(0)
nn.head()
In [ ]:
nn['price_low'] = 0
condition = nn['price'] < 100000
nn.loc[condition, 'price_low'] = 1
nn.loc[~condition, 'price_low'] = 0
nn['price_low'].value_counts()
In [ ]:
new = nn[(nn['price'] < 100000)]
new
In [ ]:
nn['bedrooms'].value_counts()
In [ ]:
counts = nn.groupby('bedrooms').size()
counts
In [ ]:
# check waterfront column values
nn['waterfront'].value_counts()
In [ ]:
# select all properties with waterfront
waterfront = nn[(nn['waterfront'] == 1)]
waterfront
In [ ]:
waterfront_1_room = nn[(nn['waterfront'] == 1) & (nn['bedrooms'] == 1)]
waterfront_1_room
In [ ]:
waterfront.describe()
In [303]:
# Histogram of raw counts for the number of bedrooms.
# `normed=False` was dropped: it is the default, and the `normed` keyword
# no longer exists in newer matplotlib releases.
plt.figure(figsize=(10, 5))
plt.hist(nn['bedrooms'])
plt.show()
In [ ]:
# Histogram of raw counts for the sale price.
# `normed=False` was dropped: it is the default, and the `normed` keyword
# no longer exists in newer matplotlib releases.
plt.figure(figsize=(10, 5))
plt.hist(nn['price'])
plt.show()
In [ ]:
# Histogram of raw counts for the living area.
# `normed=False` was dropped: it is the default, and the `normed` keyword
# no longer exists in newer matplotlib releases.
plt.figure(figsize=(10, 5))
plt.hist(nn['sqft_living'])
plt.show()
In [ ]:
# Histogram of raw counts for the lot area.
# `normed=False` was dropped: it is the default, and the `normed` keyword
# no longer exists in newer matplotlib releases.
plt.figure(figsize=(10, 5))
plt.hist(nn['sqft_lot'])
plt.show()
In [ ]:
def colorFunction(x):
    """Map a value to a colour name.

    Values equal to 0 through 9 each select the palette entry at that
    position; anything else falls back to 'pink'. Note that 5 and the
    fallback share the same colour.
    """
    palette = ('black', 'brown', 'red', 'blue', 'green',
               'pink', 'orange', 'cyan', 'yellow', 'magenta')
    # Compare in ascending order and return on the first match.
    for position, colour in enumerate(palette):
        if x == position:
            return colour
    return 'pink'
nn['color'] = nn['bedrooms'].apply(colorFunction)
figure = plt.figure()
subplot = figure.add_subplot(111)
scatter = subplot.scatter(nn['long'], nn['lat'], s=10, c=nn['color'])
subplot.set_xlabel('Longitude')
subplot.set_ylabel('Latitude')
figure.set_figheight(10)
figure.set_figwidth(15)
plt.show()
In [ ]:
# Plot every remaining column against the price, one sub-plot per column.
features = nn.drop(['id','price','date','color'], axis = 1)
# A single tall figure holds the whole grid of sub-plots.
plt.figure(figsize=(20, 55))
# i: position of the column, used to pick the sub-plot slot
for i, col in enumerate(features.columns):
    # Grid of 10 rows x 3 columns; slots are numbered starting at 1
    plt.subplot(10, 3, i+1)
    x = nn[col]
    y = nn['price']
    plt.plot(x, y, 'o')
    # Overlay a degree-1 polynomial fit, evaluated at the distinct x values
    plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('prices')
plt.show()
In [ ]:
# Fit a normal distribution to the sale prices.
(mu, sigma) = norm.fit(nn['price'])
# Histogram of the data, normalised to a density so that it is comparable
# with the fitted pdf. `density=True` replaces the removed `normed=True`.
n, bins, patches = plt.hist(nn['price'], 60, density=True, facecolor='green', alpha=0.75)
# Overlay the fitted normal pdf. scipy's `norm.pdf` replaces
# `mlab.normpdf`, which newer matplotlib releases no longer provide.
y = norm.pdf(bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=2)
# Labels and title (the title now names the plotted quantity).
plt.xlabel('Sales prices')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ sale\ prices:}\ \mu=%.3f,\ \sigma=%.3f$' % (mu, sigma))
plt.grid(True)
plt.show()
In [ ]:
# plot the heatmap
nn = pd.read_csv('kc_house_data.csv')
nn = nn.drop(['id'], axis=1)
plt.figure(figsize=(14, 12))
sns.heatmap(nn.corr())
In [ ]:
# showing correlations in the table
# Diverging colour map used for the correlation-table background gradient.
cmap = sns.diverging_palette(5, 250, as_cmap=True)
def magnify():
    """Return the list of table-style rules passed to ``set_table_styles``.

    Each rule is a dict with a CSS ``selector`` and a ``props`` list of
    (property, value) pairs. A new list is built on every call.
    """
    def rule(selector, *props):
        # One style entry: a selector plus its list of (property, value) pairs.
        return {'selector': selector, 'props': list(props)}

    return [
        rule("th", ("font-size", "7pt")),
        rule("td", ('padding', "0em 0em")),
        rule("th:hover", ("font-size", "12pt")),
        rule("tr:hover td:hover", ('max-width', '200px'), ('font-size', '12pt')),
    ]
# Render the correlation matrix as a styled table: row-wise background
# gradient, compact cells, two-decimal precision and the hover rules
# returned by magnify(). The caption typo ("magify") is corrected.
nn.corr().style.background_gradient(cmap, axis=1)\
    .set_properties(**{'max-width': '80px', 'font-size': '10pt'})\
    .set_caption("Hover to magnify")\
    .set_precision(2)\
    .set_table_styles(magnify())
In [ ]:
Here's a brief version of what you'll find in the data description file.
More about this data set can be found on the Kaggle website.
In [ ]:
# read csv data
data = pd.read_csv('housing_train.csv')
# describe dataset
data.describe()
In [ ]:
# show first 5 records in the dataset
data.head()
In [ ]:
# show last 5 records in the dataset
data.tail()
In [239]:
# row selection: the first 15 records (positions 0-14)
dataTemp = data[0:15]
In [ ]:
# iteration over rows
for row in dataTemp.iterrows():
print row[1]['SalePrice']
In [ ]:
data['Lambda'] = data['SalePrice'].apply(lambda x: x * 1.1)
dataTemp = data[0:15]
dataTemp[::3]
In [243]:
columns = ['SalePrice', 'LotArea', '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'YrSold']
data = data[columns]
In [ ]:
# One raw-count histogram per column of interest, each in its own figure.
# `normed=False` was dropped: it is the default, and the `normed` keyword
# no longer exists in newer matplotlib releases.
for column in ('SalePrice', 'LotArea', 'BedroomAbvGr'):
    plt.figure(figsize=(10, 5))
    plt.hist(data[column])
    plt.show()
In [ ]:
len(data['SalePrice'])
In [246]:
# Data filtering
dataFiltering = data[['SalePrice', 'BedroomAbvGr','LotArea']].copy()
In [ ]:
dataFiltering.head()
In [ ]:
# Handling NaN values
original = pd.read_csv('housing_train.csv')
#original.isnull().any()
original.loc[:, original.isnull().any()]
In [ ]:
# Three ways of dealing with the missing LotFrontage values. None of them
# modifies `original` in place; each call returns a new object.
original.dropna(subset=["LotFrontage"])  # option 1: drop the rows with NaN
original.drop("LotFrontage", axis=1)  # option 2: drop the whole column
# The median must come from `original`; `housing` is not defined here.
median = original["LotFrontage"].median()
original["LotFrontage"].fillna(median)  # option 3: fill NaN with the median
In [ ]:
# Mention ~ operator
count = original[(original["MSZoning"].str.contains('RL'))]
len(count)
In [ ]:
# Fit a normal distribution to the sale prices.
(mu, sigma) = norm.fit(data['SalePrice'])
# Histogram of the data, normalised to a density so that it is comparable
# with the fitted pdf. `density=True` replaces the removed `normed=True`.
n, bins, patches = plt.hist(data['SalePrice'], 60, density=True, facecolor='green', alpha=0.75)
# Overlay the fitted normal pdf. scipy's `norm.pdf` replaces
# `mlab.normpdf`, which newer matplotlib releases no longer provide.
y = norm.pdf(bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=2)
# Labels and title (the title now names the plotted quantity).
plt.xlabel('Sales prices')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ sale\ prices:}\ \mu=%.3f,\ \sigma=%.3f$' % (mu, sigma))
plt.grid(True)
plt.show()
In [ ]:
# Plot every feature against the sale price, stacked in one column.
prices = data['SalePrice']
features = data.drop('SalePrice', axis = 1)
# Create the figure once, before the loop. Creating it inside the loop
# would start a new figure per feature with a single sub-plot in slot i+1.
plt.figure(figsize=(20, 35))
# One grid row per feature, so the layout follows the number of columns.
n_rows = len(features.columns)
# i: position of the column, used to pick the sub-plot slot
for i, col in enumerate(features.columns):
    plt.subplot(n_rows, 1, i+1)
    x = data[col]
    y = prices
    plt.plot(x, y, 'o')
    # Overlay a degree-1 polynomial fit, evaluated at the distinct x values
    plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel('prices')
plt.show()
In [ ]:
foot_to_meter_ratio = 0.092903
data['LotAream2']=data['LotArea'] * foot_to_meter_ratio
data['LotAream2'] = data['LotAream2'].round(0)
data.head()
In [ ]:
x = data['LotAream2']
y = prices
plt.figure(figsize=(20, 10))
plt.plot(x, y, 'o')
# Create regression line
plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
plt.title(x.name)
plt.xlabel(x.name)
plt.ylabel('prices')
plt.show()
In [ ]:
# Creating smaller data set and filter it
dataM2 = data[['SalePrice', 'LotAream2']].copy()
low = .05
high = .9
quant_df = dataM2.quantile([low, high])
print(quant_df)
In [258]:
dataM2 = dataM2.apply(lambda x: x[(x > quant_df.loc[low, x.name]) & (x < quant_df.loc[high, x.name])], axis=0)
In [ ]:
dataM2.head()
In [ ]:
len(dataM2['SalePrice'])
In [260]:
dataM2['BedroomAbvGr']=data['BedroomAbvGr'].copy()
dataM2.head()
dataM2 = dataM2.dropna()
In [ ]:
x = dataM2['LotAream2']
y = dataM2['SalePrice']
plt.figure(figsize=(20, 15))
plt.plot(x, y, 'o')
# Create regression line
plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
plt.title(x.name)
plt.xlabel(x.name)
plt.ylabel('prices')
plt.show()
In [ ]:
dataM2["BedroomAbvGr"].value_counts()
In [ ]:
# Colour each point by its bedroom count. The numeric values are passed
# straight to scatter() with a colour map; the previous grey-level strings
# (item*270/255.) fell outside the valid 0-1 range for any count >= 1 and
# gave the colour bar no data to map.
color = dataM2["BedroomAbvGr"]
figure = plt.figure()
subplot = figure.add_subplot(111)
scatter = subplot.scatter(dataM2['LotAream2'], dataM2['SalePrice'], s=50,
                          c=color, cmap='viridis')
subplot.set_xlabel('Lot in m2')
subplot.set_ylabel('Price')
# The colour bar now reflects the bedroom counts mapped through the colour map.
plt.colorbar(scatter)
figure.set_figheight(10)
figure.set_figwidth(15)
plt.show()
In [170]:
# Correlation matrix
corr_matrix = data.corr()
In [ ]:
corr_matrix["SalePrice"].sort_values(ascending=False)
In [ ]:
attributes = ["SalePrice", "LotAream2", "BedroomAbvGr", "1stFlrSF", "2ndFlrSF"]
scatter_matrix(data[attributes], figsize=(15, 15))
data.plot(kind="scatter", x="LotAream2", y="SalePrice",alpha=0.1)
plt.show()
In [173]:
# a bit more data filtering
df = data[['SalePrice', 'LotAream2', 'BedroomAbvGr']].copy()
In [ ]:
df.head()
len(df)
In [264]:
filtered = df.drop(
df.index[(df['LotAream2'] > (df['LotAream2'].mean() + 3 * df['LotAream2'].std()))])
In [ ]:
x = filtered['LotAream2']
y = filtered['SalePrice']
plt.figure(figsize=(20, 15))
plt.plot(x, y, 'o')
# Create regression line
plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)))
plt.title(x.name)
plt.xlabel(x.name)
plt.ylabel('prices')
plt.show()
In [182]:
filtered.describe()
In [ ]:
data.describe()
In [ ]:
counts = filtered.groupby('BedroomAbvGr').size()
counts.head()
In [ ]:
filtered = df.drop(
df.index[(df['LotAream2'] > (df['LotAream2'].mean() + 3 * df['LotAream2'].std()))])